# ==================================
#
#  Code for replicating:
# "Positioning Under Alternative Electoral Systems: Evidence From Japanese Candidate Election Manifestos"
#  Amy Catalinac, NYU
#
# ==================================


# ==================================
# Pre-processing

# I created 8 election-specific TDMs by:
# -- using the R package "RMeCab" to parse out words, stem words, and keep words whose part-of-speech was
# classified as "名詞" (noun), "形容詞" (adjective), "動詞" (verb), or "副詞" (adverb), and discard
# "代名詞" (pronouns) and "非自立" (non-independent words) (see below for code);
# -- deleting punctuation, numbers, and words appearing in less than 0.5% of the manifestos

# I combined these 8 TDMs into one large TDM and:
# -- constructed a stop words list and deleted these words (see "Catalinac_master_badterms.txt");
# -- standardized different permutations of the same word (correcting for words being spelled differently);
# -- deleted words appearing in less than 0.5% of the manifestos

# Then, I split these up into 8 TDMs pertaining to each election and:
# -- deleted words that did not appear in at least one document in that election;
# These files are saved as:
# "1986_end.csv"
# "1990_end.csv"
# "1993_end.csv"
# "1996_end.csv"
# "2000_end.csv"
# "2003_end.csv"
# "2005_end.csv"
# "2009_end.csv"
# (NB: they are encoded in Shift-JIS; Japanese characters can be viewed in Excel on Windows
# by using the "import data" function; the files will need to be re-encoded if using a Mac.)

# RMeCab code is:

library(RMeCab)
# Build the term table for one election's manifestos with RMeCab, keep only
# the wanted parts of speech, and write the result to CSV.
# NOTE(review): per the RMeCab documentation, type = 1 should tokenise into
# morphemes and N = 1 should request single terms (unigrams) -- confirm.
res <- docDF("1986_manifestos", type = 1, N = 1) # e.g. 1986 manifestos

# The part-of-speech tags are written as \u escapes so the script parses the
# same way whatever encoding the file is saved or opened in. The original
# literals had been corrupted by a Shift-JIS mis-decoding, and a stray
# full-width space before one of them made the expression unparseable.
# NOTE(review): the adjective, pronoun and non-independent tags were recovered
# unambiguously from the corrupted bytes; the three two-character POS1 tags
# are assumed to be noun, verb and adverb -- confirm against the original
# Shift-JIS source.
pos1_keep <- c(
  "\u540d\u8a5e",       # meishi: noun
  "\u5f62\u5bb9\u8a5e", # keiyoshi: adjective
  "\u52d5\u8a5e",       # doshi: verb
  "\u526f\u8a5e"        # fukushi: adverb
)
pos2_drop <- c(
  "\u4ee3\u540d\u8a5e", # daimeishi: pronoun
  "\u975e\u81ea\u7acb"  # hijiritsu: non-independent
)

# Keep rows whose POS1 is in the wanted set and whose POS2 is not excluded.
res2 <- res[res$POS1 %in% pos1_keep & !(res$POS2 %in% pos2_drop), ]
write.csv(res2, file = "TDM_1986_manifestos.csv")


